'''
Author: gaoxinglong
Date: 2022-11-12 13:47:35
LastEditTime: 2022-11-12 17:27:26
LastEditors: gaoxinglong
'''
import os
import sys
import codecs
import re
from collections import OrderedDict


def read_big_mandrin_lexicon(mandrin_lexicon):
    """Load a pronunciation lexicon file into a ``word -> pronunciation`` dict.

    Each non-blank line is split on whitespace. The first field is the word.
    If the second field repeats the word, the pronunciation is everything
    after it; otherwise the pronunciation is everything after the first
    field. Pronunciation fields are re-joined with single spaces. When a word
    occurs more than once, the last occurrence wins.

    Blank lines and lines with only one field (no pronunciation) are skipped
    instead of ending the read early or raising ``IndexError``.

    Args:
        mandrin_lexicon: Path to the UTF-8 encoded lexicon file.

    Returns:
        dict mapping each word to its space-separated pronunciation string.

    Raises:
        FileNotFoundError: If ``mandrin_lexicon`` does not exist.
    """
    if not os.path.exists(mandrin_lexicon):
        raise FileNotFoundError(
            "lexicon file not found: {}".format(mandrin_lexicon))
    dict_prons = {}
    with codecs.open(mandrin_lexicon, "r", "utf-8") as f:
        for line in f:
            # str.split() with no argument drops leading/trailing whitespace
            # (tabs included), so no empty fields can appear.
            units = line.split()
            if len(units) < 2:
                # Blank line or a word without any pronunciation.
                continue
            if units[0] == units[1]:
                dict_prons[units[0]] = " ".join(units[2:])
            else:
                dict_prons[units[0]] = " ".join(units[1:])
    print("lexicon size():{} read in\n".format(len(dict_prons)))
    return dict_prons


if __name__ == "__main__":
    # Usage: <script> <lexicon> <text>
    #
    # Every non-blank line of <text> is "<id> <word> <word> ...". For each
    # line, "<id>\t<upper-cased pronunciations>" is written to <text>.pinyin.
    # Words that cannot be converted are written one per line to
    # <text>.to_pinyin_failed and left out of the output line.
    if len(sys.argv) != 3:
        sys.exit("usage: {} <lexicon> <text>".format(sys.argv[0]))
    lexicon, text = sys.argv[1:]
    prons_dict = read_big_mandrin_lexicon(mandrin_lexicon=lexicon)
    num_failed = 0
    # Open the input first so a missing text file does not leave empty
    # output files behind; `with` closes all three files even on error.
    with codecs.open(text, "r", encoding='utf8') as f, \
            codecs.open("{}.pinyin".format(text), "w", "utf-8") as fout, \
            codecs.open("{}.to_pinyin_failed".format(text), "w", "utf-8") as fbad:
        for raw_line in f:
            # split() drops all surrounding whitespace, so stray tabs cannot
            # produce empty "words".
            lineunits = raw_line.split()
            if not lineunits:
                # Skip blank lines instead of stopping at the first one.
                continue
            pinyin_units = []
            for word in lineunits[1:]:
                if word in prons_dict:
                    pinyin_units.append(prons_dict[word])
                elif re.match(r"[a-zA-Z\']", word):
                    # Out-of-lexicon token starting with a Latin letter or
                    # apostrophe: spell it character by character and end
                    # with "|". list(word) avoids the empty strings that
                    # re.split(r"", word) yields at both ends.
                    # NOTE(review): only the first character is checked, so
                    # mixed tokens are spelled out too - confirm intended.
                    en_array = list(word)
                    en_array.append("|")
                    pinyin_units.append(" ".join(en_array))
                else:
                    num_failed += 1
                    fbad.write("{}\n".format(word))
            pinyin_line = " ".join(pinyin_units)
            fout.write("{}\t{}\n".format(lineunits[0], pinyin_line.upper()))
    print("failed number {}\n".format(num_failed))
            
            